!! Copyright (C) 2009,2010,2011,2012  Marco Restelli
!!
!! This file is part of:
!!   FEMilaro -- Finite Element Method toolkit
!!
!! FEMilaro is free software; you can redistribute it and/or modify it
!! under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 3 of the License, or
!! (at your option) any later version.
!!
!! FEMilaro is distributed in the hope that it will be useful, but
!! WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
!! General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with FEMilaro; If not, see <http://www.gnu.org/licenses/>.
!!
!! author: Marco Restelli                   <marco.restelli@gmail.com>

!> \file
!! DG compressible Navier-Stokes euqations with stratification.
!!
!! \n
!!
!! This is a simple version of the discontinuous Galerkin method for
!! atmospheric problems. The main features of this implementation
!! are:
!! <ul>
!!  <li> simplicial elements
!!  <li> explicit time integration
!!  <li> boundary conditions are either free slip or open boundaries
!! </ul>
!!
!! The name of the input file can be specified as command argument,
!! otherwise the default \c input_file_name_def is used.
!<
program dg_comp_main

 !$ use mod_omp_utils, only: &
 !$   mod_omp_utils_constructor, &
 !$   mod_omp_utils_destructor,  &
 !$   mod_omp_utils_initialized, &
 !$   detailed_timing_omp, &
 !$   omput_push_key,      &
 !$   omput_pop_key,       &
 !$   omput_start_timer,   &
 !$   omput_close_timer,   &
 !$   omput_write_time

 use mod_utils, only: &
   t_realtime, my_second

 use mod_messages, only: &
   mod_messages_constructor, &
   mod_messages_destructor,  &
   error,   &
   warning, &
   info

 use mod_kinds, only: &
   mod_kinds_constructor, &
   mod_kinds_destructor,  &
   wp

 use mod_fu_manager, only: &
   mod_fu_manager_constructor, &
   mod_fu_manager_destructor,  &
   new_file_unit

 use mod_mpi_utils, only: &
   mod_mpi_utils_constructor, &
   mod_mpi_utils_destructor,  &
   mpi_logical, mpi_integer, wp_mpi, &
   mpi_comm_world,                   &
   mpi_init, mpi_finalize,           &
   mpi_comm_size, mpi_comm_rank,     &
   mpi_bcast, mpi_gather

 use mod_octave_io, only: &
   mod_octave_io_constructor, &
   mod_octave_io_destructor,  &
   real_format, &
   write_octave, read_octave_al

 use mod_sympoly, only: &
   mod_sympoly_constructor, &
   mod_sympoly_destructor

 use mod_octave_io_sympoly, only: &
   mod_octave_io_sympoly_constructor, &
   mod_octave_io_sympoly_destructor

 use mod_linal, only: &
   mod_linal_constructor, &
   mod_linal_destructor

 use mod_perms, only: &
   mod_perms_constructor, &
   mod_perms_destructor

 use mod_octave_io_perms, only: &
   mod_octave_io_perms_constructor, &
   mod_octave_io_perms_destructor

 use mod_output_control, only: &
   mod_output_control_constructor, &
   mod_output_control_destructor,  &
   elapsed_format, &
   base_name

 use mod_master_el, only: &
   mod_master_el_constructor, &
   mod_master_el_destructor

 use mod_grid, only: &
   mod_grid_constructor, &
   mod_grid_destructor,  &
   t_grid, t_ddc_grid,   &
   new_grid, clear,      &
   affmap, locate_point, &
   write_octave

 use mod_bcs, only: &
   mod_bcs_constructor, &
   mod_bcs_destructor,    &
   t_bcs, new_bcs, clear, &
   write_octave
 
 use mod_sections, only: &
  mod_sections_constructor, &
  mod_sections_destructor,  &
  t_section_collection,   &
  new_section_collection, &
  clear, write_octave

 use mod_numquad, only: &
   mod_numquad_constructor, &
   mod_numquad_destructor

 use mod_base, only: &
   mod_base_constructor, &
   mod_base_destructor,  &
   t_base, clear, &
   write_octave

 use mod_fe_spaces, only: &
   mod_fe_spaces_constructor, &
   mod_fe_spaces_destructor,  &
   dg_scal

 use mod_state_vars, only: &
   mod_state_vars_constructor, &
   mod_state_vars_destructor

 use mod_time_integrators, only: &
   mod_time_integrators_constructor, &
   mod_time_integrators_destructor,  &
   c_ode, c_ods, c_tint, t_ti_init_data, t_ti_step_diag, &
   t_ee, t_hm, t_rk4, t_ssprk54, &
   t_thetam, t_bdf1, t_bdf2, t_bdf3, t_bdf2ex, &
   t_erb2kry, t_erb2lej, &
   t_pcexpo

 use mod_physical_constants, only: &
   mod_physical_constants_constructor, &
   mod_physical_constants_destructor

 use mod_dgcomp_testcases, only: &
   mod_dgcomp_testcases_constructor, &
   mod_dgcomp_testcases_destructor,  &
   test_name, test_description, &
   ntrcs, ref_prof, t_ref, &
   coeff_init,   &
   coeff_outref

 use mod_atm_refstate, only: &
   mod_atm_refstate_constructor, &
   mod_atm_refstate_destructor,  &
   t_atm_refstate, atm_ref, &
   t_atm_refstate_e, atm_ref_e, &
   write_octave

 use mod_dgcomp_flowstate, only: &
   mod_dgcomp_flowstate_constructor, &
   mod_dgcomp_flowstate_destructor,  &
   update_flowstate, &
   mass_flow, kinetic_energy

 use mod_turb_flux, only: &
   mod_turb_flux_constructor, &
   mod_turb_flux_destructor,  &
   c_turbmod_progs, c_turbmod_diags, t_turb_diags, &
   t_viscous_flux, t_smagorinsky_flux

 use mod_dgcomp_rhs, only: &
   mod_dgcomp_rhs_constructor, &
   mod_dgcomp_rhs_destructor,  &
   t_bcs_error, &
   dgcomp_tens, &
   compute_courant

 use mod_dgcomp_sponges, only: &
   mod_dgcomp_sponges_constructor, &
   mod_dgcomp_sponges_destructor,  &
   use_sponges, &
   sponge_apply_splitted

 use mod_dgcomp_statistics, only: &
   mod_dgcomp_statistics_constructor, &
   mod_dgcomp_statistics_destructor,  &
   update_stats

 use mod_dgcomp_ode, only: &
   mod_dgcomp_ode_constructor, &
   mod_dgcomp_ode_destructor,  &
   t_dgcomp_stv, t_dgcomp_ode, t_dgcomp_ods, &
   new_dgcomp_stv, new_dgcomp_ode, clear

!----------------------------------------------------------------------
 
 implicit none

!----------------------------------------------------------------------
 
 ! Main parameters
 character(len=*), parameter :: this_prog_name = 'dg-comp'
 integer, parameter :: max_char_len = 1000

 ! Main variables

 ! ODE variables and state vectors
 integer :: nvar ! number of variables
 type(t_dgcomp_stv) :: uuu0, uuun
 type(t_dgcomp_ode) :: ode
 type(t_dgcomp_ods) :: ods
 type(t_ti_init_data) :: ti_init_data
 type(t_ti_step_diag) :: ti_step_diag
 class(c_tint), allocatable :: tint
 type(t_turb_diags) :: turb_diags
 real(wp), allocatable :: uuu_io(:,:,:), uuu_init(:,:,:), im_init(:)

 ! FE space
 integer :: k, k_dyn
 type(t_base), target :: base

 ! Grid
 integer :: nbound
 character(len=max_char_len) :: grid_file, cbc_type
 type(t_grid    ), target :: grid
 type(t_ddc_grid)         :: ddc_grid
 type(t_bcs), target :: bcs
 integer, allocatable :: bc_type_t(:,:)

 ! Statistics
 character(len=max_char_len) :: section_file, section_var
 type(t_section_collection) :: sections

 ! Time stepping
 logical :: l_checkpoint, l_output, l_statstics
 integer :: nstep, n, n_out
 real(wp) :: tt_sta, tt_end, dt, t_n, t_nm1, time_last_out, dt_out, &
   time_last_stats, dt_stats, time_last_check, dt_check
 
 ! Test case
 character(len=max_char_len) :: testname
 logical :: viscous_flow
 character(len=max_char_len) :: turb_model
 character(len=max_char_len) :: integrator

 ! Initial condition
 integer :: ie, i, h
 real(wp), allocatable :: u_r(:,:), uuue(:,:)
 logical :: init_from_file
 character(len=max_char_len) :: init_file_name

 ! IO
 character(len=*), parameter :: input_file_name_def = 'dg-comp.in'
 character(len=max_char_len) :: input_file_name
 character(len=max_char_len) :: basename
 character(len=*), parameter :: out_file_nml_suff = '-nml.octxt'
 character(len=max_char_len+len(out_file_nml_suff)):: out_file_nml_name
 logical :: write_grid
 character(len=*), parameter :: out_file_grid_suff = '-grid.octxt'
 character(len=10000+len(out_file_grid_suff)) :: out_file_grid_name
 character(len=*), parameter :: out_file_base_suff = '-base.octxt'
 character(len=10000+len(out_file_base_suff)) :: out_file_base_name
 character(len=*), parameter :: out_file_initial_suff = '-init.octxt'
 character(len=10000+len(out_file_initial_suff))::out_file_initial_name
 ! additional output at selected times
 integer :: n_sot, nn_sot ! counter and number of special output times
 real(wp), allocatable :: selected_output_times(:)
 ! fast output at selected points
 logical :: l_output_fop
 integer :: np_fop ! number of fast output points
 integer, allocatable :: ie_fop(:)
 real(wp) :: dt_fop, time_last_out_fop
 real(wp), allocatable :: xx_fop(:,:)

 ! MPI variables
 integer :: mpi_nd, mpi_id

 ! Auxiliary variables
 integer :: fu, ierr
 real(t_realtime) :: t00, t0, t1, t0step
 character(len=100*max_char_len) :: message(11)

 ! Input namelist
 namelist /input/ &
   ! test case
   testname,     &
   viscous_flow, &
   turb_model,   &
   integrator,   &
   init_from_file, init_file_name, &
   ! output base name
   basename, &
   ! base
   k, &
   !base dyn
   k_dyn, &
   ! grid
   write_grid,  &
   grid_file,   &
   ! boundary conditions
   nbound, cbc_type, &
   ! sections (use an empty section list for none)
   section_file, section_var, &
   ! time stepping
   tt_sta, tt_end, dt, &
   ! output
   dt_out, &
   ! chepoints
   dt_check, &
   ! statistics
   dt_stats, &
   ! special output times
   nn_sot, & ! number of times in selected_output_times (possibly 0)
   ! fast output points
   dt_fop, np_fop

 ! Auxiliary namelist (to make the allocation)
 namelist /out_details/ &
   selected_output_times, &
   xx_fop


 !---------------------------------------------------------------------
 ! Initializations and startup
 t00 = my_second()

 call mod_messages_constructor()

 !$ if(detailed_timing_omp) call mod_omp_utils_constructor()

 call mod_kinds_constructor()

 call mpi_init(ierr)
 call mod_mpi_utils_constructor()
 call mpi_comm_size(mpi_comm_world,mpi_nd,ierr)
 call mpi_comm_rank(mpi_comm_world,mpi_id,ierr)

 call mod_fu_manager_constructor()

 call mod_octave_io_constructor()

 call mod_sympoly_constructor()
 call mod_octave_io_sympoly_constructor()

 call mod_linal_constructor()

 call mod_perms_constructor()
 call mod_octave_io_perms_constructor()
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Read input file
 if(command_argument_count().gt.0) then
   call get_command_argument(1,value=input_file_name)
 else
   input_file_name = input_file_name_def
 endif

 call new_file_unit(fu,ierr)
 open(fu,file=trim(input_file_name), &
      status='old',action='read',form='formatted',iostat=ierr)
 read(fu,input)
 close(fu,iostat=ierr)
 ! If there are more processes, each of them needs its own basename
 ! and grid
 if(mpi_nd.gt.1) then
   write(basename, '(a,a,i4.4)') trim(basename),'-P',mpi_id
   write(grid_file,'(a,a,i4.4)') trim(grid_file),'.',mpi_id
 endif
 ! echo the input namelist
 out_file_nml_name = trim(basename)//out_file_nml_suff
 open(fu,file=trim(out_file_nml_name), &
      status='replace',action='write',form='formatted',iostat=ierr)
 write(fu,input)
 close(fu,iostat=ierr)

 allocate(bc_type_t(nbound,2))
 read(cbc_type,*) bc_type_t ! transposed, for simplicity
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! output control setup
 call mod_output_control_constructor(basename)
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Construct the grid
 t0 = my_second()

 call mod_master_el_constructor()

 call mod_grid_constructor()
 call new_grid(grid,trim(grid_file) , ddc_grid,mpi_comm_world)

 ! write the octave output
 if(write_grid) then
   out_file_grid_name = trim(base_name)//out_file_grid_suff
   call new_file_unit(fu,ierr)
   open(fu,file=trim(out_file_grid_name), &
        status='replace',action='write',form='formatted',iostat=ierr)
    call write_octave(grid    ,'grid'    ,fu)
    call write_octave(ddc_grid,'ddc_grid',fu)
   close(fu,iostat=ierr)
 endif

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Completed grid construction: elapsed time ',t1-t0,'s.'
 if(mpi_id.eq.0) &
   call info(this_prog_name,'',message(1))
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Read the remaining input namelists
 call new_file_unit(fu,ierr)
 open(fu,file=trim(input_file_name), &
      status='old',action='read',form='formatted',iostat=ierr)
  allocate(selected_output_times(nn_sot),xx_fop(grid%m,np_fop))
  read(fu,out_details)
 close(fu,iostat=ierr)
 ! echo the input namelist
 open(fu,file=trim(out_file_nml_name), status='old', & 
   action='readwrite',form='formatted',position='append',iostat=ierr)
  write(fu,out_details)
 close(fu,iostat=ierr)
 ! the easiest way to deal with the nn_sot.eq.0 case is the following
 if(nn_sot.eq.0) then
   nn_sot = 1
   deallocate(selected_output_times); 
   allocate(selected_output_times(1))
   selected_output_times = 2.0_wp*tt_end
 endif
 call locate_point(ie_fop,xx_fop,grid); deallocate(xx_fop)
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Collect the information concerning the boundary conditions
 call mod_bcs_constructor()
 call new_bcs(bcs,grid,transpose(bc_type_t) , ddc_grid,mpi_comm_world)
 deallocate( bc_type_t )
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Define the "sections" for the diagnostics
 call mod_sections_constructor()
 call new_section_collection( sections , section_file , section_var , &
                                grid , ddc_grid , mpi_comm_world )
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Define the finite element space
 t0 = my_second()

 call mod_numquad_constructor()

 call mod_base_constructor()

 call mod_fe_spaces_constructor()

 base = dg_scal( grid%me , k )

 ! write the octave output
 out_file_base_name = trim(base_name)//out_file_base_suff
 call new_file_unit(fu,ierr)
 open(fu,file=trim(out_file_base_name), &
      status='replace',action='write',form='formatted',iostat=ierr)
   call write_octave(base,'base',fu)
   if(write_grid) then
     call write_octave(bcs,'bcs',fu)
   endif
 close(fu,iostat=ierr)

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Completed FE basis construction: elapsed time ',t1-t0,'s.'
 if(mpi_id.eq.0) &
   call info(this_prog_name,'',message(1))
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Physical data setup
 call mod_physical_constants_constructor()
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Set up the test case
 call mod_dgcomp_testcases_constructor(testname,grid%d)
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Allocate working arrays
 nvar = 1 + 1 + grid%d ! number of variables: density, energy, momentum
 nvar = nvar + ntrcs   ! include the tracers
 allocate( uuu_io(nvar,base%pk,grid%ne) )
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Set up the reference state

 call mod_atm_refstate_constructor(grid,base,ref_prof,t_ref, &
        mpi_comm_world,ddc_grid)

 ! Set the the output reference state
 allocate(u_r(nvar,base%m),uuue(nvar,base%m))
 u_r = 0.0_wp
 do ie=1,grid%ne
   u_r(1,:) = atm_ref_e(:,ie)%rho
   u_r(2,:) = atm_ref_e(:,ie)%e
   ! the momentum is always zero in the reference state
   ! there are no tracers in the reference state
   uuue = coeff_outref(affmap(grid%e(ie),base%xig),u_r)
   ! projection: use that the bais is orthogonal
   do i=1,base%pk
     do h=1,nvar
       uuu_io(h,i,ie) = sum( base%wg * uuue(h,:) * base%p(i,:) )
     enddo
   enddo
 enddo
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Other intializations
 call mod_state_vars_constructor()
 call mod_dgcomp_flowstate_constructor(grid,mpi_comm_world)
 call mod_turb_flux_constructor(base, grid, k_dyn)
 call mod_dgcomp_rhs_constructor(grid,ddc_grid,viscous_flow, &
                                 mpi_comm_world,base)

 call mod_dgcomp_sponges_constructor(grid,base,dt)
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Allocate time integration variables
 call mod_time_integrators_constructor()
 call mod_dgcomp_ode_constructor(ddc_grid%nd)
 call new_dgcomp_ode(ode,grid,base,bcs,viscous_flow,mpi_comm_world)
 call new_dgcomp_stv(uuu0,grid,base,bcs,nvar,mpi_comm_world)
 ! uuun will be defined later, including the turbulence model

 ! Define the turbulence model
 if(viscous_flow) then
   turb_mod_case: select case(trim(turb_model))
    case('viscous')
     allocate(t_viscous_flux::ode%turbmod)
    case('smagorinsky')
     allocate(t_smagorinsky_flux::ode%turbmod) 
    case default
     call error(this_prog_name,"", &
          'Unknown turbulence model "'//trim(turb_model)//'".')
   end select turb_mod_case
   call ode%turbmod%init(uuu0%turbmod_progs,ods%turbmod_diags, &
                         turb_diags,grid,base,ntrcs)
 else
   turb_diags%ndiags = 0
   allocate(turb_diags%diag_names(0),turb_diags%diags(0,0,0))
 endif
 ! Now we can allocate uuun
 ! compiler bug: the assignment could cause problem as in DPD200243378
 ! with intel compiler, so for the time being it is better to set the
 ! fields one by one
 !uuun = uuu0
 call new_dgcomp_stv(uuun,grid,base,bcs,nvar,mpi_comm_world)
 if(allocated(uuu0%turbmod_progs)) &
   allocate(uuun%turbmod_progs,source=uuu0%turbmod_progs)

 call mod_dgcomp_statistics_constructor(sections,grid,base)

 ! Set the initial condition
 initial_cond_if: if(init_from_file) then
   ! If there is only one subdomain, init_file_name is used as the
   ! name of the initial condition file. If there are more subdomains,
   ! however, each of them needs a separate file. In this case,
   ! init_file_name is used as a format to obtain the file name of
   ! each processor. In practice, one would specify
   !   init_file_name = '("outfile"i3.3"-res-0123.octxt")'
   ! to read "outfile000-res-0123.octxt", "outfile001-res-0123.octxt"
   ! "outfile002-res-0123.octxt", ... on the various processors. This
   ! is useful to use the results of previous runs directly as input
   ! files.
   if(mpi_nd.gt.1) then
     message(1) = trim(init_file_name)
     write(init_file_name,trim(message(1))) mpi_id
   endif
   if(mpi_id.eq.0) &
     call info(this_prog_name,'','Reading initial condition from "' &
                // trim(init_file_name) // '".')
   call new_file_unit(fu,ierr)
   open(fu,file=trim(init_file_name), &
        status='old',action='read',form='formatted',iostat=ierr)
   if(ierr.ne.0) call error(this_prog_name,'',  &
     'Problem opening initial condition from "' &
     // trim(init_file_name) // '".')
   call read_octave_al(uuu_init,'uuu',fu)
   uuu0%u = 0.0_wp
   uuu0%u(:,1:min(size(uuu0%u,2),size(uuu_init,2)),:) =          &
              uuu_init(:,1:min(size(uuu0%u,2),size(uuu_init,2)),:)
   deallocate(uuu_init)
   call read_octave_al(im_init,'im',fu)
   uuu0%im = im_init
   deallocate(im_init)
   close(fu,iostat=ierr)
   ! complete the input including the IO reference state
   uuu0%u = uuu0%u + uuu_io
 else
   u_r = 0.0_wp
   do ie=1,grid%ne
     u_r(1,:) = atm_ref_e(:,ie)%rho
     u_r(2,:) = atm_ref_e(:,ie)%e
     uuue = coeff_init(affmap(grid%e(ie),base%xig),u_r)
     do i=1,base%pk
       do h=1,nvar
         uuu0%u(h,i,ie) = sum( base%wg * uuue(h,:) * base%p(i,:) )
       enddo
     enddo
   enddo
   uuu0%im = 0.0_wp
 endif initial_cond_if
 deallocate(u_r,uuue)

 ! write the octave output: initial setup
 out_file_initial_name = trim(base_name)//out_file_initial_suff
 call new_file_unit(fu,ierr)
 open(fu,file=trim(out_file_initial_name), &
      status='replace',action='write',form='formatted',iostat=ierr)
   call write_octave(atm_ref      ,'atm_ref'  ,fu)
   call write_octave(uuu_io       ,'uuu_ioref',fu)
   call write_octave(uuu0%u-uuu_io,'uuu0'     ,fu)
   call write_octave(uuu0%im  ,'c','im'       ,fu)
 close(fu,iostat=ierr)
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Setup for the fast output
 if(size(ie_fop).gt.0) &
   call write_fop_output( trim(base_name) , t_n , uuu0%u , .true. )
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Time integration
 !
 !   0        1                 n             nstep
 !   |--------|--------|--------|--------|------|
 ! tt_sta            t_nm1     t_n            tt_end
 !                     | step_n |
 !
 
 ! Set time integration method
 time_int_case: select case(trim(integrator))
  case('rk')
   allocate(t_ssprk54::tint)
  case('exp')
   allocate(t_erb2kry::tint)
  case default
   call error(this_prog_name, '',&
    'Unknown integration method "'//trim(integrator)//'".')
 end select time_int_case
 
 
 call init_time_stepping()
 ti_init_data%dim = 25
 ti_init_data%tol = 1.0e-5_wp
 ti_init_data%mpi_id   = mpi_id
 ti_init_data%mpi_comm = mpi_comm_world

 ! Set-up the model
 call tint%init( ode,dt,tt_sta,uuu0,ods,               &
             init_data=ti_init_data , step_diag=ti_step_diag )
 time_loop: do n=1,nstep

   !$ if(detailed_timing_omp) then
   !$   call omput_push_key("TimeStep")
   !$   call omput_start_timer()
   !$ endif
   t0 = my_second()
   t0step = t0

   call init_time_step() ! setup and synchronization

   !-------------------------------------------------------------------
   ! Update the turbulent model
   !  TODO
   !-------------------------------------------------------------------

   !-------------------------------------------------------------------
   ! Update the statistics
   !
   ! This is done here so that we can include statistics related to
   ! the turbulent model - this can not be done after the time step,
   ! because the turbulent model would be outdated.
   if(l_statstics) call update_stats(sections,t_nm1,uuu0,ods,ode)
   !-------------------------------------------------------------------

   !-------------------------------------------------------------------
   ! Compute the time step
   !$ if(detailed_timing_omp) then
   !$   call omput_push_key("TimeStep")
   !$   call omput_start_timer()
   !$ endif
   call tint%step(uuun,ode,t_nm1,uuu0,ods,ti_step_diag)
   if(uuun%bcerr%lerr) call error(this_prog_name,"",uuun%bcerr%message)
   call uuu0%copy( uuun )
   !$ if(detailed_timing_omp) then ! TimeStep
   !$   call omput_write_time()
   !$   call omput_close_timer()
   !$   call omput_pop_key()
   !$ endif
   !-------------------------------------------------------------------

   !-------------------------------------------------------------------
   ! Open boundary conditions: tracers are not affected
   !$ if(detailed_timing_omp) then
   !$   call omput_push_key("OpenBoundaries")
   !$   call omput_start_timer()
   !$ endif
   if(use_sponges) &
     call sponge_apply_splitted( uuu0%u(1:2+grid%d,:,:) , grid , base )
   !$ if(detailed_timing_omp) then ! OpenBoundaries
   !$   call omput_write_time()
   !$   call omput_close_timer()
   !$   call omput_pop_key()
   !$ endif
   !-------------------------------------------------------------------

   !-------------------------------------------------------------------
   ! Checkpoint
   if(l_checkpoint) then
     t1 = my_second()
     call check_step(t1-t0step , grid , ddc_grid , base , dt , &
                     uuu0%u , ti_step_diag)
   endif
   !-------------------------------------------------------------------

   !-------------------------------------------------------------------
   ! Output
   if(l_output) then
     t1 = my_second()
     call write_step( n, nstep, t1-t0step, n_out, trim(base_name), &
         t_n,uuu0%u,uuu0%im, uuu0%turbmod_progs,ods%turbmod_diags, &
         dt, ti_step_diag)
   endif
   if((size(ie_fop).gt.0).and.l_output_fop) &
     call write_fop_output( trim(base_name) , t_n , uuu0%u , .false. )
   !-------------------------------------------------------------------

   !$ if(detailed_timing_omp) then ! TimeStep
   !$   call omput_write_time()
   !$   call omput_close_timer()
   !$   call omput_pop_key()
   !$ endif
 enddo time_loop
 call tint%clean(ode,ti_step_diag)
 !---------------------------------------------------------------------

 !---------------------------------------------------------------------
 ! Clean up
 deallocate( selected_output_times )
 if(viscous_flow) then
   call ode%turbmod%clean(ods%turbmod_diags,turb_diags)
   deallocate(ode%turbmod)
 endif
 call clear(uuun)
 call clear(uuu0)
 deallocate( uuu_io )

 ! one should finalize the turbulence model and turb_diags

 call mod_dgcomp_statistics_destructor()

 call mod_dgcomp_ode_destructor()

 call mod_time_integrators_destructor()

 call mod_dgcomp_sponges_destructor()

 call mod_dgcomp_rhs_destructor()

 call mod_turb_flux_destructor()

 call mod_dgcomp_flowstate_destructor()

 call mod_state_vars_destructor()

 call mod_atm_refstate_destructor()

 call mod_dgcomp_testcases_destructor()

 call mod_physical_constants_destructor()

 call mod_fe_spaces_destructor()
  
 call clear( base )
 call mod_base_destructor()

 call mod_numquad_destructor()
  
 call clear( sections )
 call mod_sections_destructor()
  
 call clear( bcs )
 call mod_bcs_destructor()
  
 call clear( grid )
 call clear( ddc_grid )
 call mod_grid_destructor()
  
 call mod_master_el_destructor()
  
 call mod_output_control_destructor()
 deallocate(ie_fop)
  
 call mod_octave_io_perms_destructor()
 call mod_perms_destructor()
  
 call mod_linal_destructor()
  
 call mod_octave_io_sympoly_destructor()
 call mod_sympoly_destructor()
  
 call mod_octave_io_destructor()

 call mod_fu_manager_destructor()
  
 call mod_mpi_utils_destructor()
 call mpi_finalize(ierr)
  
 call mod_kinds_destructor()

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Done: elapsed time ',t1-t00,'s.'
 if(mpi_id.eq.0) &
   call info(this_prog_name,'',message(1))

 !$ if(detailed_timing_omp) call mod_omp_utils_destructor()

 call mod_messages_destructor()
 !---------------------------------------------------------------------

contains

 !---------------------------------------------------------------------

 subroutine check_step(wc_timestep,grid,ddc_grid,base,dt,uuu,sd)
  real(t_realtime), intent(in) :: wc_timestep
  type(t_grid), intent(in) :: grid
  type(t_ddc_grid), intent(in) :: ddc_grid
  type(t_base), intent(in) :: base
  real(wp), intent(in) :: dt, uuu(:,:,:)
  type(t_ti_step_diag), intent(in) :: sd

  integer, parameter :: sdim = 10 ! MPI message size
  integer :: nlines, imax, ierr
  real(wp) :: vtot
  real(wp), allocatable :: c_tot(:), c_adv(:,:)
  real(wp), allocatable :: sbuf(:), rbuf(:,:)
  character(len=3+5+8) :: tmp_format

   ! Compute the Courant number
   allocate(c_adv(2,grid%ne),c_tot(grid%ne))
   call compute_courant(c_tot,c_adv,grid,base,dt,uuu)

   ! Maxima on the local subdomain
   allocate(sbuf(sdim))
   imax = maxloc(c_tot,1)
   sbuf(1) = real(imax,wp)         ! max position
   sbuf(2) = c_tot(imax)           ! max value
   sbuf(3) = sum(grid%e%vol*c_tot) ! average
   imax = maxloc(c_adv(1,:),1)
   sbuf(4) = real(imax,wp)
   sbuf(5) = c_adv(1,imax)
   sbuf(6) = sum(grid%e%vol*c_adv(1,:))
   imax = maxloc(c_adv(2,:),1)
   sbuf(7) = real(imax,wp)
   sbuf(8) = c_adv(2,imax)
   sbuf(9) = sum(grid%e%vol*c_adv(2,:))
   sbuf(10) = grid%vol

   ierr = 0 ! used as temporary
   if(mpi_id.eq.0) ierr = ddc_grid%nd
   allocate(rbuf(sdim,ierr)) ! better to always allocate arrays
   call mpi_gather(sbuf,sdim,wp_mpi, rbuf,sdim,wp_mpi, 0, &
                   mpi_comm_world, ierr)

   ! Terminal output
   if(mpi_id.eq.0) then
     write(message(1),'(a,i6,a,i6,a,e8.2,a,e8.2,a)') &
      'Time step ',n,' of ',nstep,'; model time ',t_n,' of ',tt_end,'.'
     ! timings
     write(message(2),'(a)') &
       '  CPU timings:'
     write(message(3),elapsed_format) &
       '    total elapsed time:            ',my_second()-t00,'s.'
     write(message(4),elapsed_format) &
       '    last time step:                ',wc_timestep,'s.'
     ! Courant numbers
     vtot = sum(rbuf(10,:))
     write(message(5),'(a)') &
       '  Courant numbers:'
     imax = maxloc(rbuf(2,:),1)
     write(message(6),'(a,e8.2,a,i8,a,i8,a,e8.2)') &
       '    total:         value ',rbuf(2,imax),'; proc ',int(imax), &
       '; elem ',int(rbuf(1,imax)),'; average ', sum(rbuf(3,:))/vtot
     imax = maxloc(rbuf(5,:),1)
     write(message(7),'(a,e8.2,a,i8,a,i8,a,e8.2)') &
       '    advective (1): value ',rbuf(5,imax),'; proc ',int(imax), &
       '; elem ',int(rbuf(4,imax)),'; average ', sum(rbuf(6,:))/vtot
     imax = maxloc(rbuf(8,:),1)
     write(message(8),'(a,e8.2,a,i8,a,i8,a,e8.2)') &
       '    advective (2): value ',rbuf(8,imax),'; proc ',int(imax), &
       '; elem ',int(rbuf(7,imax)),'; average ', sum(rbuf(9,:))/vtot
     nlines = 8
     if(allocated(sd%d1)) then
       write(message(9),'(a)') &
       '  iterative solver diagnostics:'
       write(message(10),'(a,i5)') &
       '    number of iterations:          ',sd%iterations
       ! Make sure we don't exceed the message length
       write(tmp_format,'(a,i5,a)') &
         '(a,' , (len(message(10))-40)/11 , '(e11.3))'
       write(message(11),tmp_format) &
       '    residuals:                     ',sd%d1(:sd%iterations)
       nlines = 11
     endif
     call info(this_prog_name,'',message(1:nlines))
   endif

   deallocate(c_adv,c_tot , sbuf,rbuf)

 end subroutine check_step

 !---------------------------------------------------------------------
 
 !> Time step output
 !!
 !! In this subroutine, the main variables of the code are accessed
 !! directly from the host code; none of them however is changed. The
 !! only exception is \c uuu, which is treated as input argument
 !! so that various time levels can be written.
 subroutine write_step( n, nstep, wc_time, n_out, bname, t_n,         &
                        uuu, im, turbmod_progs, turbmod_diags, dt, sd )
  integer, intent(in) :: n, nstep, n_out
  real(t_realtime), intent(in) :: wc_time
  character(len=*), intent(in) :: bname
  real(wp), intent(in) :: t_n, dt
  real(wp), intent(in) :: uuu(:,:,:), im(:)
  class(c_turbmod_progs), intent(in),    allocatable :: turbmod_progs
  class(c_turbmod_diags), intent(inout), allocatable :: turbmod_diags
  type(t_ti_step_diag), intent(in) :: sd

  integer :: fu, ierr
  real(wp) :: tmp1(0,0,0), tmp2(0)            ! will not be used
  class(c_turbmod_progs), allocatable :: tmp3 ! will not be used
  type(t_bcs_error) :: tmp4                   ! will not beused
  real(wp), allocatable :: c_tot(:), c_adv(:,:)

  character(len=*), parameter :: time_stamp_format = '(i4.4)'
  character(len=4) :: time_stamp
  character(len=*), parameter :: suff1 = '-res-'
  character(len=*), parameter :: suff2 = '.octxt'
  character(len= len_trim(bname) + len(suff1) + 4 + len(suff2)) :: &
     out_file_res_name

   ! Terminal output
   write(message(1),'(a,i7,a,i7,a)') &
     'Time step ',n,' of ',nstep,'; elapsed time '
   write(message(1),elapsed_format) trim(message(1)),wc_time,'s.'
   if(mpi_id.eq.0) &
     call info(this_prog_name,'',message(1:1))

   ! File output
   call new_file_unit(fu,ierr)
   write(time_stamp,time_stamp_format) n_out
   out_file_res_name = trim(bname)//suff1//time_stamp//suff2
   open(fu,file=trim(out_file_res_name), &
        status='replace',action='write',form='formatted',iostat=ierr)
     ! preamble
     call write_octave(test_name       ,'test_name'       ,fu)
     call write_octave(test_description,'test_description',fu)
     call write_octave(t_n             ,'time'            ,fu)
     ! data
     call write_octave(uuu-uuu_io,'uuu',fu)
     call write_octave(im    ,'c','im' ,fu)
     ! additional diagnostics
     call update_flowstate(uuu,base,grid)
     call write_octave(mass_flow     ,'c','mass_flow' ,fu)
     call write_octave(kinetic_energy,'kinetic_energy',fu)
     call dgcomp_tens( tmp1,tmp2,tmp3,tmp4 , grid,base,bcs,        &
                       viscous_flow,ode%turbmod, t_n,uuu,im,       &
                       turbmod_progs, turbmod_diags, td=turb_diags )
     call write_octave(turb_diags%diag_names,'td_names',fu)
     call write_octave(turb_diags%diags,'td_diags',fu)
     ! Courant number
     allocate(c_adv(2,grid%ne),c_tot(grid%ne))
     call compute_courant(c_tot,c_adv,grid,base,dt,uuu)
     call write_octave(c_tot,'r','c_tot',fu)
     call write_octave(c_adv,'c_adv',fu)
     deallocate(c_adv,c_tot)
     ! time integrator diagnostics (not all time integrators)
     if(allocated(sd%d1)) then
       call write_octave(sd%max_iter,'ti_max_iter',fu)
       call write_octave(sd%iterations,'ti_iter',fu)
       call write_octave(sd%d1(:sd%iterations),'r','ti_res',fu)
       call write_octave(sd%d2(:sd%iterations,:sd%iterations),'ti_H', &
                         fu)
     endif
     ! sections
     call write_octave(sections,'sections',fu)
   close(fu,iostat=ierr)

 end subroutine write_step

 !---------------------------------------------------------------------

 !> Write the solution on a selected subset of elements
 !!
 !! The solution is written in a single file by appending each new
 !! time level. Only the preamble is written in octave format, since
 !! there is no easy way to write an array incrementally in octave.
 !!
 !! \note When \c init is set to <tt>.true.<\tt> the input arguments
 !! \c t_n and \c uuu are not used, so that they don't need to be
 !! defined. However, the variables \c grid and \c base need to be
 !! defined.
 !<
 subroutine write_fop_output( bname, t_n, uuu , init )
  character(len=*), intent(in) :: bname
  real(wp), intent(in) :: t_n, uuu(:,:,:)
  logical, intent(in) :: init
 
  integer :: nvars
  character(len=10) :: d_size

  character(len=*), parameter :: delimiter = '#-/DELIMITER\-#'
  character(len=*), parameter :: suff1 = '-selectedres.octxt'
  character(len= len_trim(bname) + len(suff1)) :: out_file_res_name

   out_file_res_name = trim(bname)//suff1

   nvars = 2+grid%d+ntrcs ! total number of variables

   if(init) then
     call new_file_unit(fu,ierr)
     open(fu,file=trim(out_file_res_name), status='replace', &
       action='write',form='formatted')
      ! indexes of the selected elements
      call write_octave(ie_fop,'r','ie_fop',fu)
      ! shape of a data block
      call write_octave((/nvars,base%pk,size(ie_fop)/),'r','DS',fu)
      write(fu,'(a)') delimiter
     close(fu)
     return
   endif
 
   write(d_size,'(i10)') nvars*base%pk*size(ie_fop)
   call new_file_unit(fu,ierr)
   open(fu,file=trim(out_file_res_name), status='old',    &
     action='readwrite',form='formatted',position='append')
    ! current time
    write(fu,'('//real_format//')') t_n
    ! data
    write(fu,'('//d_size//'('//real_format//')'//')') uuu(:,:,ie_fop)
   close(fu)

 end subroutine write_fop_output
 
!----------------------------------------------------------------------
 
 subroutine init_time_stepping()
 ! Set the main time stepping variables in a consistent way on all the
 ! processes.
 
  integer, parameter :: &
    n_ibuff = 3, &
    n_rbuff = 2
  integer ::  intgbuff(n_ibuff)
  real(wp) :: realbuff(n_rbuff)

   if(mpi_id.eq.0) then ! master process
     nstep = ceiling((tt_end-tt_sta)/dt)
     n_out = 0
     n_sot = 1
     t_n               = tt_sta
     time_last_out     = 0.0_wp
     time_last_check   = 0.0_wp
     time_last_stats   = 1.1_wp*dt_stats ! ensure statistics update
     time_last_out_fop = 0.0_wp
   endif

   if(mpi_nd.le.1) return

   if(mpi_id.eq.0) then ! master process

     intgbuff = (/ nstep , n_out , n_sot /)
     realbuff = (/ dt , t_n /)
     call mpi_bcast(intgbuff,n_ibuff,mpi_integer,0,mpi_comm_world,ierr)
     call mpi_bcast(realbuff,n_rbuff,wp_mpi     ,0,mpi_comm_world,ierr)

   else

     call mpi_bcast(intgbuff,n_ibuff,mpi_integer,0,mpi_comm_world,ierr)
     call mpi_bcast(realbuff,n_rbuff,wp_mpi     ,0,mpi_comm_world,ierr)
     nstep = intgbuff(1)
     n_out = intgbuff(2)
     n_sot = intgbuff(3)
     dt                = realbuff(1)
     t_n               = realbuff(2)

   endif
   
 end subroutine init_time_stepping
 
!----------------------------------------------------------------------

 subroutine init_time_step()
 ! Sets the main time step variables in a consistent way on all the
 ! processes.

  integer, parameter :: &
    n_lbuff = 4, &
    n_ibuff = 2, &
    n_rbuff = 3
  logical ::  logcbuff(n_lbuff)
  integer ::  intgbuff(n_ibuff)
  real(wp) :: realbuff(n_rbuff)

   if(mpi_id.eq.0) then ! master process

     ! main time step variables
     t_nm1 = t_n                  ! t^{n-1}
     t_n = tt_sta + real(n,wp)*dt ! t^n
     if(n.eq.nstep) then ! fix the last time step
       t_n = tt_end
       dt = t_n-t_nm1
       time_last_out = huge(time_last_out)/100.0_wp
     endif

     ! statistics
     time_last_stats = time_last_stats + dt
     if(time_last_stats.ge.dt_stats) then
       l_statstics = .true.
       time_last_stats = time_last_stats - dt_stats
     else
       l_statstics = .false.
     endif

     ! checkpoint
     time_last_check = time_last_check + dt
     if(time_last_check.ge.dt_check) then
       l_checkpoint = .true.
       time_last_check = time_last_check - dt_check
     else
       l_checkpoint = .false.
     endif

     ! output
     time_last_out = time_last_out + dt
     if( (time_last_out.ge.dt_out) .or. &
         (t_n.ge.selected_output_times(n_sot)) ) then
       l_output = .true.
       n_out = n_out+1
       if(time_last_out.ge.dt_out) time_last_out = time_last_out - & 
          dt_out
         if(t_n.ge.selected_output_times(n_sot)) then
         if(n_sot.lt.nn_sot) then
           n_sot = n_sot + 1
         else ! last selected time: redefine it to avoid other outputs
           selected_output_times(n_sot) = 2.0_wp*tt_end
         endif
       endif
     else
       l_output = .false.
     endif
     time_last_out_fop = time_last_out_fop + dt
     ! notice that size(ie_fop) is different on each processor, and in
     ! particular it can be zero on some processors, so that we can
     ! not test it here
     if(time_last_out_fop.ge.dt_fop) then
       l_output_fop = .true.
       time_last_out_fop = time_last_out_fop - dt_fop
     else
       l_output_fop = .false.
     endif

     ! communication
     if(mpi_nd.gt.0) then
       logcbuff = (/ l_statstics,l_checkpoint,l_output,l_output_fop /)
       intgbuff = (/ n_out , n_sot /)
       realbuff = (/ t_nm1 , t_n , dt /)
       call mpi_bcast(logcbuff,n_lbuff,mpi_logical,0,mpi_comm_world, &
             ierr)
       call mpi_bcast(intgbuff,n_ibuff,mpi_integer,0,mpi_comm_world, &
             ierr)
       call mpi_bcast(realbuff,n_rbuff,wp_mpi     ,0,mpi_comm_world, &
             ierr)
     endif

   else

     call mpi_bcast(logcbuff,n_lbuff,mpi_logical,0,mpi_comm_world,ierr)
     call mpi_bcast(intgbuff,n_ibuff,mpi_integer,0,mpi_comm_world,ierr)
     call mpi_bcast(realbuff,n_rbuff,wp_mpi     ,0,mpi_comm_world,ierr)
     l_statstics  = logcbuff(1)
     l_checkpoint = logcbuff(2)
     l_output     = logcbuff(3)
     l_output_fop = logcbuff(4)
     n_out = intgbuff(1)
     n_sot = intgbuff(2)
     t_nm1                        = realbuff(1)
     t_n                          = realbuff(2)
     dt                           = realbuff(3)

   endif

 end subroutine init_time_step

!----------------------------------------------------------------------

end program dg_comp_main

